In [1]:
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')
In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import roc_curve, auc

PRE-PROCESSING¶

In [3]:
# DATA Pre-Processing

with open('/content/sample_data/drug_consumption.data', 'r') as file:
    data = file.read()

# Naming all the columns with their associated names

column_name = ['ID','Age','Sex','Education','Country','Ethnicity','Neuroticism','Extraversion','Openness','Agreeableness','Conscientiousness','Impulsivness','Sensation Seeing','Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']

df = pd.read_csv(StringIO(data), header=None)


df.columns = column_name
df2=df.copy(deep=True)

### Mapping the columns with the values associated with the given data.

df.loc[df['Age'] == -0.95197, 'Age'] = "18-24"
df.loc[df['Age'] == -0.07854, 'Age'] = "25-34"
df.loc[df['Age'] == 0.49788, 'Age'] = "35-44"
df.loc[df['Age'] == 1.09449, 'Age'] = "45-54"
df.loc[df['Age'] == 1.82213, 'Age'] = "55-64"
df.loc[df['Age'] == 2.59171, 'Age'] = "65+"

df.loc[df['Sex'] == -0.48246, 'Sex'] = "Male"
df.loc[df['Sex'] == 0.48246, 'Sex'] = "Female"

education_mapping = {
    -2.43591: 'Left school before 16 years',
    -1.73790: 'Left school at 16 years',
    -1.43719: 'Left school at 17 years',
    -1.22751: 'Left school at 18 years',
    -0.61113: 'Some college or university, no certificate or degree',
    -0.05921: 'Professional certificate/ diploma',
    0.45468: 'University degree',
    1.16365: 'Masters degree',
    1.98437: 'Doctorate degree'
}
df['Education'] = df['Education'].replace(education_mapping)

country_mapping = {
    -0.09765: 'Australia',
    0.24923: 'Canada',
    -0.46841: 'New Zealand',
    -0.28519: 'Other',
    0.21128: 'Republic of Ireland',
    0.96082: 'UK',
    -0.57009: 'USA',
}
df['Country'] = df['Country'].replace(country_mapping)

ethnicity_mapping = {
    -0.50212: 'Asian',
    -1.10702: 'Black',
    1.90725: 'Mixed-Black/Asian',
    0.12600: 'Mixed-White/Asian',
    -0.22166: 'Mixed-White/Black',
    0.11440: 'Other',
    -0.31685: 'White',
}
df['Ethnicity'] = df['Ethnicity'].replace(ethnicity_mapping)

drugs_mapping = {
    'CL0': 'Never Used',
    'CL1': 'Used over a Decade Ago',
    'CL2': 'Used in Last Decade',
    'CL3': 'Used in Last Year',
    'CL4': 'Used in Last Month',
    'CL5': 'Used in Last Week',
    'CL6': 'Used in Last Day',
}

columns_to_categorize = ['Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']

for col in columns_to_categorize:
   df[col] = df[col].replace(drugs_mapping)

# For all the columns with test scores we search on the internet the results considered as High, Average and Low which was High was when the score > mean+std , average = mean-std > score < mean+std, low = score < mean-std
columns_to_categorize_s = ['Neuroticism', 'Extraversion', 'Openness', 'Agreeableness', 'Conscientiousness','Impulsivness','Sensation Seeing']

# Apply the categorization to each specified column
for col in columns_to_categorize_s:
    bins = [-float("inf"), df[col].mean() - df[col].std(), df[col].mean() + df[col].std(), float("inf")]
    labels = ['Low '+col, 'Average '+col, 'High '+col]
    df[col] = pd.cut(df[col], bins=bins, labels=labels)

df = df.set_index('ID')
df_standard = df.copy(deep=True)
df_standard.to_csv('df_standard.csv', index=False)

# Examen des valeurs uniques pour certaines colonnes catégorielles
unique_values = {}
columns_to_check = ['Age', 'Sex', 'Education', 'Country', 'Ethnicity', 'Neuroticism', 'Extraversion', 'Openness', 'Agreeableness', 'Conscientiousness', 'Alcohol', 'Caffeine', 'Nicotine']
for col in columns_to_check:
    unique_values[col] = df[col].unique()

unique_values
Out[3]:
{'Age': array(['35-44', '25-34', '18-24', '65+', '45-54', '55-64'], dtype=object),
 'Sex': array(['Female', 'Male'], dtype=object),
 'Education': array(['Professional certificate/ diploma', 'Doctorate degree',
        'Masters degree', 'Left school at 18 years',
        'Left school at 16 years', 'University degree',
        'Some college or university, no certificate or degree',
        'Left school before 16 years', 'Left school at 17 years'],
       dtype=object),
 'Country': array(['UK', 'Canada', 'USA', 'Other', 'Australia', 'Republic of Ireland',
        'New Zealand'], dtype=object),
 'Ethnicity': array(['Mixed-White/Asian', 'White', 'Other', 'Mixed-White/Black',
        'Asian', 'Black', 'Mixed-Black/Asian'], dtype=object),
 'Neuroticism': ['Average Neuroticism', 'Low Neuroticism', 'High Neuroticism']
 Categories (3, object): ['Low Neuroticism' < 'Average Neuroticism' < 'High Neuroticism'],
 'Extraversion': ['Average Extraversion', 'High Extraversion', 'Low Extraversion']
 Categories (3, object): ['Low Extraversion' < 'Average Extraversion' < 'High Extraversion'],
 'Openness': ['Average Openness', 'High Openness', 'Low Openness']
 Categories (3, object): ['Low Openness' < 'Average Openness' < 'High Openness'],
 'Agreeableness': ['Average Agreeableness', 'Low Agreeableness', 'High Agreeableness']
 Categories (3, object): ['Low Agreeableness' < 'Average Agreeableness' < 'High Agreeableness'],
 'Conscientiousness': ['Average Conscientiousness', 'Low Conscientiousness', 'High Conscientiousness']
 Categories (3, object): ['Low Conscientiousness' < 'Average Conscientiousness' < 'High Conscientiousness'],
 'Alcohol': array(['Used in Last Week', 'Used in Last Day', 'Used in Last Month',
        'Used in Last Decade', 'Used over a Decade Ago', 'Never Used',
        'Used in Last Year'], dtype=object),
 'Caffeine': array(['Used in Last Day', 'Used in Last Week', 'Used in Last Month',
        'Used in Last Year', 'Never Used', 'Used over a Decade Ago',
        'Used in Last Decade'], dtype=object),
 'Nicotine': array(['Used in Last Decade', 'Used in Last Month', 'Never Used',
        'Used in Last Day', 'Used over a Decade Ago', 'Used in Last Year',
        'Used in Last Week'], dtype=object)}

DATASET STUDY¶

In [4]:
df1 = df.copy(deep=True)

counts_by_country = df1['Country'].value_counts()
df1['CountryWithCount'] = df1['Country'].apply(lambda x: f"{x} ({counts_by_country[x]})")

fig = px.pie(df1, names="CountryWithCount", color="CountryWithCount", title="Pourcentage of people surveyed by country")
fig.show()

bar_fig = px.histogram(df1, x="CountryWithCount", color="CountryWithCount", title="Number of people surveyed by country")
bar_fig.update_layout(xaxis_title="Country", yaxis_title="Count")
bar_fig.show()

In this graph we see that the people from New Zealand represent 0.265% of the people of our survey and the people from the Republic of Ireland represent 1.06% of the answer. In order to have a more efficient model we set those peoples nationalities to other.

In [5]:
df1 = df.copy(deep=True)

counts_by_country = df1['Age'].value_counts()
df1['AgeWithCount'] = df1['Age'].apply(lambda x: f"{x} ({counts_by_country[x]})")

fig = px.pie(df1, names="AgeWithCount", color="AgeWithCount", title="Pourcentage of people surveyed by Age")
fig.show()

fig = px.histogram(df1, x="AgeWithCount", color="AgeWithCount", title="Number of people surveyed by Age")
fig.update_layout(xaxis_title="Age", yaxis_title="Count")
fig.show()

In this graph we see that the people who are more than 65 represent less than 1% of the people of our survey. In order to have a more efficient model we have to create a new category "55+" which englobes people from "55-64" and people from "65+".

In [6]:
df1 = df.copy(deep=True)

counts_by_country = df1['Education'].value_counts()
df1['EducationWithCount'] = df1['Education'].apply(lambda x: f"{x} ({counts_by_country[x]})")

fig = px.pie(df1, names="EducationWithCount", color="EducationWithCount", title="Pourcentage of people surveyed by Education")
fig.show()

fig = px.histogram(df1, x="EducationWithCount", color="EducationWithCount", title="Number of people surveyed by Education")
fig.update_layout(xaxis_title="Education", yaxis_title="Count")
fig.show()

We create a new category "Left school before 18" which englobes "left school at 17"+"Left school before 17"+"Left school before 16"

In [7]:
df_user = df.copy(deep=True)

drug_using = ['User_Alcohol','User_Amphet', 'User_Amyl', 'User_Benzos', 'User_Caff', 'User_Cannabis', 'User_Choc', 'User_Coke', 'User_Crack','User_Ecstasy', 'User_Heroin', 'User_Ketamine', 'User_Legalh', 'User_LSD', 'User_Meth', 'User_Mushrooms','User_Nicotine', 'User_Semer', 'User_VSA']

for i in range(len(columns_to_categorize)):
    df_user.loc[((df_user[columns_to_categorize[i]]=="Never Used") | (df_user[columns_to_categorize[i]]=="Used over a Decade Ago")),drug_using[i]] = 'Non-user'
    df_user.loc[((df_user[columns_to_categorize[i]]=="Used in Last Year") | (df_user[columns_to_categorize[i]]=="Used in Last Decade") | (df_user[columns_to_categorize[i]]=="Used in Last Day") | (df_user[columns_to_categorize[i]]=="Used in Last Week") | (df_user[columns_to_categorize[i]]=="Used in Last Month")),drug_using[i]] = 'User'
In [8]:
count_of_users = []
count_of_non_users = []
for i in range(len(columns_to_categorize)):
    s = df_user.groupby([drug_using[i]])[columns_to_categorize[i]].count()
    count_of_users.append(s[1])
    count_of_non_users.append(s[0])
In [9]:
trace1 = go.Bar(
    x=columns_to_categorize,
    y=count_of_users,
    name='User',
    marker = dict(color="rgba(220, 20, 60, 0.7)")
)
trace2 = go.Bar(
    x=columns_to_categorize,
    y=count_of_non_users,
    name='Non-User',
    marker = dict(color="rgba(0, 128, 0, 0.7)")
)

data = [trace1, trace2]
layout = go.Layout(
    title= 'Drug Vs User Or Non-user',
    yaxis=dict(title='Count', ticklen=5, gridwidth=2),
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
fig.show()

MODIFY DATAFRAME¶

Dataframe for Visualization¶

In [10]:
df4=df.copy(deep=True)
In [11]:
drugs_mapping = {
    'Never Used': 0,
    'Used over a Decade Ago': 1,
    'Used in Last Decade': 2,
    'Used in Last Year': 3,
    'Used in Last Month': 4,
    'Used in Last Week': 5,
    'Used in Last Day': 6,
}

columns_to_categorize = ['Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
for col in columns_to_categorize:
   df4[col] = df4[col].replace(drugs_mapping)

df4.head()
Out[11]:
Age Sex Education Country Ethnicity Neuroticism Extraversion Openness Agreeableness Conscientiousness ... Ecstasy Heroin Ketamine Legal Highs LSD Methadone Mushrooms Nicotine Semer VSA
ID
1 35-44 Female Professional certificate/ diploma UK Mixed-White/Asian Average Neuroticism Average Extraversion Average Openness Average Agreeableness Average Conscientiousness ... 0 0 0 0 0 0 0 2 0 0
2 25-34 Male Doctorate degree UK White Average Neuroticism High Extraversion High Openness Average Agreeableness Average Conscientiousness ... 4 0 2 0 2 3 0 4 0 0
3 35-44 Male Professional certificate/ diploma UK White Average Neuroticism Average Extraversion Average Openness Low Agreeableness Low Conscientiousness ... 0 0 0 0 0 0 1 0 0 0
4 18-24 Female Masters degree UK White Average Neuroticism Average Extraversion Average Openness Average Agreeableness Average Conscientiousness ... 0 0 2 0 0 0 0 2 0 0
5 35-44 Female Doctorate degree UK White Average Neuroticism Low Extraversion Average Openness Average Agreeableness High Conscientiousness ... 1 0 0 1 0 0 2 2 0 0

5 rows × 31 columns

In [12]:
df4.replace({'New Zealand': 'Other', 'Republic of Ireland':'Other'}, inplace=True)
df4.replace({'55-64':'55+','65+':'55+'},inplace=True)
df4.replace({"Left school at 17 years":"Left school before 18 years", "Left school at 16 years":"Left school before 18 years","Left school before 16 years":"Left school before 18 years"}, inplace=True)

Dataframe for Modelization¶

In [13]:
drugs_mapping = {
    'CL0': 0,
    'CL1': 1,
    'CL2': 2,
    'CL3': 3,
    'CL4': 4,
    'CL5': 5,
    'CL6': 6,
}

columns_to_categorize = ['Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
for col in columns_to_categorize:
   df2[col] = df2[col].replace(drugs_mapping)

df2.head()
Out[13]:
ID Age Sex Education Country Ethnicity Neuroticism Extraversion Openness Agreeableness ... Ecstasy Heroin Ketamine Legal Highs LSD Methadone Mushrooms Nicotine Semer VSA
0 1 0.49788 0.48246 -0.05921 0.96082 0.12600 0.31287 -0.57545 -0.58331 -0.91699 ... 0 0 0 0 0 0 0 2 0 0
1 2 -0.07854 -0.48246 1.98437 0.96082 -0.31685 -0.67825 1.93886 1.43533 0.76096 ... 4 0 2 0 2 3 0 4 0 0
2 3 0.49788 -0.48246 -0.05921 0.96082 -0.31685 -0.46725 0.80523 -0.84732 -1.62090 ... 0 0 0 0 0 0 1 0 0 0
3 4 -0.95197 0.48246 1.16365 0.96082 -0.31685 -0.14882 -0.80615 -0.01928 0.59042 ... 0 0 2 0 0 0 0 2 0 0
4 5 0.49788 0.48246 1.98437 0.96082 -0.31685 0.73545 -1.63340 -0.45174 -0.30172 ... 1 0 0 1 0 0 2 2 0 0

5 rows × 32 columns

In [14]:
df2['Country'].replace({-0.46841: -0.28519, 0.21128:-0.28519}, inplace=True)
df2['Age'].replace({2.59171: 1.82213},inplace=True)
df2['Education'].replace({-2.43591:-1.43719,-1.73790:-1.43719}, inplace=True)

Dataframe for Percentage of Use prediction¶

In [15]:
df3=df.copy(deep=True)
df3.replace({'New Zealand': 'Other', 'Republic of Ireland':'Other'}, inplace=True)
df3['Country'].unique()
Out[15]:
array(['UK', 'Canada', 'USA', 'Other', 'Australia'], dtype=object)
In [16]:
df3.replace({'55-64':'55+','65+':'55+'},inplace=True)
df3['Age'].unique()
Out[16]:
array(['35-44', '25-34', '18-24', '55+', '45-54'], dtype=object)
In [17]:
df3.replace({"Left school at 17 years":"Left school before 18 years", "Left school at 16 years":"Left school before 18 years","Left school before 16 years":"Left school before 18 years"}, inplace=True)

Heatmap¶

In [18]:
column_name = ['ID','Age','Sex','Education','Country','Ethnicity','Neuroticism','Extraversion','Openness','Agreeableness','Conscientiousness','Impulsivness','Sensation Seeing','Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
with open('/content/sample_data/drug_consumption.data', 'r') as file:
    data = file.read()
df_for_heat = pd.read_csv(StringIO(data), header=None)
df_for_heat.columns = column_name
mapping = {
    "CL0": 0,
    "CL1": 1,
    "CL2": 2,
    "CL3": 3,
    "CL4": 4,
    "CL5": 5,
    "CL6": 6
}
country_mapping_heat = {
    -0.46841: -0.28519,
    0.21128: -0.28519,
}
education_mapping_heat = {
    -2.43591: -1.43719,
    -1.73790: -1.43719,
}

age_mapping_heat = {
    2.59171: 1.82213
}

df_for_heat = df_for_heat.replace(mapping)
df_for_heat['Country'] = df_for_heat['Country'].replace(country_mapping_heat)
df_for_heat['Education'] = df_for_heat['Education'].replace(education_mapping_heat)
df_for_heat['Age'] = df_for_heat['Age'].replace(age_mapping_heat)


df_for_heat = df_for_heat.drop("ID",axis=1)
df_for_heat = df_for_heat.reset_index(drop=True)
df_for_heat_reset = df_for_heat.reset_index(drop=True)

df_for_heat_corr = df_for_heat_reset.corr()
plt.figure(figsize=(20, 20))
fig = sns.heatmap(df_for_heat_corr, cmap='coolwarm', annot=True)
plt.show()

Data Modelization¶

Predict any drug use using best features¶

Standardizing the heatmap by drug

In [19]:
for drug in columns_to_categorize:
  mean=df_for_heat_corr[drug].mean()
  std=df_for_heat_corr[drug].std(ddof=0)
  df_for_heat_corr[drug]=(df_for_heat_corr[drug]-mean)/std

Creating the dictionary that will have the pertaining features for each drug

In [20]:
dfs={}

for drug in columns_to_categorize:
  drugs=columns_to_categorize[:]
  drugs.remove(drug)
  inter_down=df_for_heat_corr[drug].quantile(0.4)
  inter_up=df_for_heat_corr[drug].quantile(0.6)
  temp_df=df_for_heat_corr.copy(deep=True)
  temp_df=temp_df[(temp_df[drug]>inter_up)|(temp_df[drug]<inter_down)]
  temp_df.drop(columns=drugs,axis=1,inplace=True)
  temp_df.drop(columns=['Age','Sex','Education','Country','Ethnicity','Neuroticism',  'Extraversion',  'Openness',  'Agreeableness','Conscientiousness','Impulsivness','Sensation Seeing'],axis=1,inplace=True)
  dfs[f"df_{drug}"]=temp_df
In [22]:
for drug in columns_to_categorize:
  print(drug,len(dfs[f'df_{drug}']), dfs[f'df_{drug}'])
Alcohol 24                     Alcohol
Age               -0.493926
Sex               -0.370167
Education          0.364151
Country            0.086219
Ethnicity         -0.053963
Neuroticism       -0.366489
Extraversion       0.161909
Agreeableness     -0.478270
Conscientiousness -0.357074
Sensation Seeing   0.252624
Alcohol            5.254320
Amphetamines      -0.411966
Amyl Nitrite       0.130753
Benzodiazepine    -0.413322
Caffeine           0.356214
Cocaine            0.146659
Crack             -0.468818
Ecstasy            0.056900
Heroin            -0.545480
Ketamine           0.013752
LSD               -0.287919
Methadone         -0.799265
Nicotine          -0.006619
Semer             -0.558521
Amphetamines 24                    Amphetamines
Age                   -1.501856
Sex                   -1.424431
Education             -1.236616
Country               -2.010839
Ethnicity             -0.398753
Extraversion          -0.818260
Agreeableness         -1.178505
Conscientiousness     -1.470542
Alcohol               -0.714279
Amphetamines           2.661744
Benzodiazepine         0.992571
Caffeine              -0.486889
Canabis                0.851960
Chocolate             -0.882213
Cocaine                1.093883
Ecstasy                1.062414
Heroin                 0.525452
Ketamine               0.567873
Legal Highs            0.906835
LSD                    0.721730
Methadone              0.648377
Mushrooms              0.749677
Nicotine               0.496546
Semer                 -0.581816
Amyl Nitrite 24                    Amyl Nitrite
Age                   -1.148064
Sex                   -1.467539
Education             -0.690678
Ethnicity             -0.501307
Neuroticism           -0.527611
Extraversion          -0.542598
Openness              -0.396912
Agreeableness         -1.149406
Conscientiousness     -1.253918
Sensation Seeing       0.250811
Amphetamines           0.831473
Amyl Nitrite           4.118429
Benzodiazepine         0.356920
Canabis                0.370838
Chocolate             -0.678906
Cocaine                1.125521
Ecstasy                1.037419
Ketamine               0.964293
Legal Highs            0.562994
LSD                    0.088099
Methadone             -0.413390
Mushrooms              0.331973
Nicotine               0.385331
Semer                 -0.656507
Benzodiazepine 24                    Benzodiazepine
Age                     -1.149679
Sex                     -1.171353
Education               -1.192893
Country                 -2.137862
Ethnicity               -0.254375
Extraversion            -1.061291
Agreeableness           -1.283306
Conscientiousness       -1.435523
Alcohol                 -0.722422
Amphetamines             1.134811
Benzodiazepine           2.950190
Caffeine                -0.483413
Canabis                  0.618258
Chocolate               -0.874508
Cocaine                  0.925654
Crack                    0.578949
Ecstasy                  0.559536
Heroin                   0.867682
Legal Highs              0.614452
LSD                      0.415335
Methadone                1.202613
Mushrooms                0.566929
Nicotine                 0.418026
Semer                   -0.590376
Caffeine 24                    Caffeine
Sex               -0.465533
Country           -0.385860
Ethnicity          0.082401
Neuroticism       -0.326510
Extraversion      -0.089456
Agreeableness     -0.494194
Conscientiousness -0.560497
Sensation Seeing  -0.102616
Alcohol            0.327112
Amphetamines      -0.068243
Amyl Nitrite       0.057064
Benzodiazepine    -0.082728
Caffeine           5.337020
Chocolate          0.260328
Cocaine            0.039226
Crack             -0.317795
Heroin            -0.317034
Ketamine          -0.320970
Legal Highs       -0.362575
LSD               -0.427172
Methadone         -0.268187
Nicotine           0.325574
Semer             -0.464896
VSA               -0.072721
Canabis 24                     Canabis
Age               -1.885530
Sex               -1.457895
Education         -1.401661
Country           -2.198102
Neuroticism       -0.287616
Extraversion      -0.611500
Openness           0.651365
Agreeableness     -1.006721
Conscientiousness -1.378918
Impulsivness       0.345961
Sensation Seeing   0.775061
Alcohol           -0.501504
Amphetamines       0.782250
Benzodiazepine     0.487486
Caffeine          -0.449722
Canabis            2.377802
Chocolate         -0.758653
Cocaine            0.753369
Ecstasy            1.060449
Legal Highs        1.063601
LSD                0.966351
Mushrooms          1.139886
Nicotine           0.948955
Semer             -0.439579
Chocolate 24                 Chocolate
Age              0.172826
Sex              0.299880
Education        0.081048
Country          0.558245
Ethnicity        0.033495
Neuroticism     -0.029723
Extraversion     0.011328
Agreeableness    0.096180
Alcohol          0.147315
Amphetamines    -0.416793
Amyl Nitrite    -0.086846
Benzodiazepine  -0.373392
Caffeine         0.516325
Canabis         -0.438459
Chocolate        5.219498
Cocaine         -0.427750
Crack           -0.724267
Ecstasy         -0.357804
Heroin          -0.512602
Legal Highs     -0.392684
LSD             -0.512451
Methadone       -0.332792
Mushrooms       -0.489921
VSA             -0.489911
Cocaine 24                     Cocaine
Age               -1.546011
Sex               -1.396869
Education         -1.182455
Country           -1.752817
Ethnicity         -0.477363
Extraversion      -0.672428
Agreeableness     -1.471679
Conscientiousness -1.461494
Alcohol           -0.467833
Amphetamines       1.072016
Benzodiazepine     0.765551
Caffeine          -0.512549
Canabis            0.785323
Chocolate         -0.997693
Cocaine            2.708446
Crack              0.581669
Ecstasy            1.350536
Heroin             0.699633
Ketamine           0.760482
Legal Highs        0.660420
LSD                0.559424
Mushrooms          0.707899
Nicotine           0.619524
Semer             -0.618320
Crack 24                       Crack
Age               -0.822680
Sex               -1.247222
Education         -1.229910
Country           -1.568633
Ethnicity         -0.469025
Extraversion      -0.829153
Agreeableness     -1.049848
Conscientiousness -1.172082
Alcohol           -0.700647
Amphetamines       0.609413
Benzodiazepine     0.831823
Caffeine          -0.556208
Canabis            0.360121
Chocolate         -1.108672
Cocaine            1.009550
Crack              3.549062
Ecstasy            0.351305
Heroin             1.579097
LSD                0.342457
Methadone          0.866032
Mushrooms          0.472991
Nicotine           0.387227
Semer             -0.506687
VSA                0.425834
Ecstasy 24                     Ecstasy
Age               -1.925570
Sex               -1.422561
Education         -1.253586
Country           -1.774366
Ethnicity         -0.465697
Neuroticism       -0.471680
Agreeableness     -1.059359
Conscientiousness -1.386760
Sensation Seeing   0.542005
Alcohol           -0.460093
Amphetamines       0.966766
Amyl Nitrite       0.448868
Benzodiazepine     0.396163
Caffeine          -0.586237
Canabis            1.066917
Chocolate         -0.850983
Cocaine            1.251042
Ecstasy            2.490815
Ketamine           0.924695
Legal Highs        1.070598
LSD                1.124148
Mushrooms          1.049582
Nicotine           0.517200
Semer             -0.569879
Heroin 24                      Heroin
Age               -1.083237
Sex               -1.143190
Education         -1.137117
Country           -1.769579
Ethnicity         -0.450206
Extraversion      -0.924309
Agreeableness     -1.271122
Conscientiousness -1.226796
Alcohol           -0.746008
Amphetamines       0.776787
Benzodiazepine     1.032459
Caffeine          -0.559001
Chocolate         -0.917563
Cocaine            1.021088
Crack              1.418124
Ecstasy            0.409694
Heroin             3.242629
Ketamine           0.347425
Legal Highs        0.318841
LSD                0.462904
Methadone          1.229626
Mushrooms          0.419519
Semer             -0.550219
VSA                0.372350
Ketamine 24                    Ketamine
Age               -1.609272
Sex               -1.482609
Education         -1.071489
Country           -1.168711
Ethnicity         -0.593560
Neuroticism       -0.470141
Extraversion      -0.646612
Agreeableness     -1.165683
Conscientiousness -1.336923
Amphetamines       0.775874
Amyl Nitrite       0.656195
Benzodiazepine     0.468940
Caffeine          -0.665570
Canabis            0.517548
Chocolate         -0.826410
Cocaine            1.048734
Ecstasy            1.315998
Heroin             0.278915
Ketamine           3.286900
Legal Highs        0.925252
LSD                0.921570
Mushrooms          0.922074
Nicotine           0.304104
Semer             -0.483569
Legal Highs 24                    Legal Highs
Age                  -1.905834
Sex                  -1.570871
Education            -1.305606
Country              -1.918121
Ethnicity            -0.342330
Extraversion         -0.701326
Agreeableness        -1.021906
Conscientiousness    -1.379464
Sensation Seeing      0.682741
Alcohol              -0.497569
Amphetamines          0.899633
Benzodiazepine        0.532537
Caffeine             -0.563437
Canabis               1.146647
Chocolate            -0.758534
Cocaine               0.705855
Ecstasy               1.146916
Ketamine              0.699219
Legal Highs           2.540059
LSD                   0.890100
Methadone             0.430418
Mushrooms             1.077191
Nicotine              0.481358
Semer                -0.534542
LSD 24                         LSD
Age               -1.655357
Sex               -1.508198
Education         -1.228797
Country           -2.206700
Neuroticism       -0.490216
Extraversion      -0.551179
Openness           0.581186
Agreeableness     -0.912066
Conscientiousness -1.127243
Sensation Seeing   0.567584
Alcohol           -0.570640
Amphetamines       0.741758
Benzodiazepine     0.365323
Caffeine          -0.624211
Canabis            1.068437
Chocolate         -0.861611
Cocaine            0.627138
Ecstasy            1.229141
Ketamine           0.710574
Legal Highs        0.910282
LSD                2.610982
Mushrooms          1.543741
Nicotine           0.331508
Semer             -0.380206
Methadone 24                    Methadone
Age                -1.240293
Sex                -1.200067
Education          -1.189178
Country            -2.044143
Extraversion       -0.980958
Agreeableness      -1.109752
Conscientiousness  -1.236320
Alcohol            -0.824506
Amphetamines        0.922662
Amyl Nitrite       -0.325659
Benzodiazepine      1.368426
Caffeine           -0.449860
Canabis             0.546429
Chocolate          -0.697708
Cocaine             0.725218
Crack               0.769754
Ecstasy             0.415054
Heroin              1.218051
Legal Highs         0.655662
LSD                 0.442967
Methadone           3.130309
Mushrooms           0.481325
Semer              -0.548798
VSA                 0.413763
Mushrooms 24                    Mushrooms
Age                -1.690986
Sex                -1.501169
Education          -1.232011
Country            -2.179068
Neuroticism        -0.502639
Extraversion       -0.570137
Openness            0.533751
Agreeableness      -0.990490
Conscientiousness  -1.243640
Sensation Seeing    0.562760
Alcohol            -0.550110
Amphetamines        0.720370
Benzodiazepine      0.455391
Caffeine           -0.510167
Canabis             1.202346
Chocolate          -0.871738
Cocaine             0.715953
Ecstasy             1.099587
Ketamine            0.663539
Legal Highs         1.049736
LSD                 1.483662
Mushrooms           2.534704
Nicotine            0.387513
Semer              -0.322692
Nicotine 24                    Nicotine
Age               -1.585401
Sex               -1.364454
Education         -1.575166
Country           -1.709497
Ethnicity         -0.334410
Extraversion      -0.704420
Agreeableness     -1.055187
Conscientiousness -1.512497
Sensation Seeing   0.535685
Alcohol           -0.393432
Amphetamines       0.713644
Benzodiazepine     0.527440
Caffeine          -0.147552
Canabis            1.335532
Chocolate         -0.785497
Cocaine            0.900728
Ecstasy            0.821113
Ketamine           0.345802
Legal Highs        0.671182
LSD                0.484531
Mushrooms          0.602109
Nicotine           3.186563
Semer             -0.528976
VSA                0.342464
Semer 24                       Semer
Age               -0.554092
Sex               -0.195398
Education         -0.487993
Country           -0.650448
Neuroticism       -0.280097
Openness          -0.119749
Conscientiousness -0.222620
Impulsivness      -0.207664
Sensation Seeing  -0.000184
Alcohol           -0.474215
Amphetamines      -0.103591
Amyl Nitrite      -0.234040
Caffeine          -0.333149
Canabis           -0.022826
Chocolate         -0.493377
Cocaine           -0.008836
Ecstasy           -0.050164
Ketamine           0.064158
LSD                0.130962
Methadone         -0.292079
Mushrooms          0.288047
Nicotine          -0.119830
Semer              5.366112
VSA                0.045397
VSA 24                         VSA
Age               -1.595559
Sex               -1.183045
Education         -1.163495
Country           -1.768690
Ethnicity         -0.228411
Extraversion      -0.744407
Agreeableness     -1.093681
Conscientiousness -1.288853
Sensation Seeing   0.475482
Alcohol           -0.493714
Amphetamines       0.555772
Benzodiazepine     0.576475
Caffeine          -0.356427
Canabis            0.564016
Chocolate         -0.921132
Cocaine            0.610012
Ecstasy            0.479539
Heroin             0.499035
Legal Highs        0.772545
LSD                0.628617
Methadone          0.510873
Nicotine           0.494514
Semer             -0.361534
VSA                3.700016
In [23]:
def prediction(modelo,drug,params=None):
  model=modelo(**(params or {}))
  X=df2[dfs[f'df_{drug}'].index]
  Y=df2[drug]
  X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=37)
  model.fit(X_train,Y_train)
  Y_pred=model.predict(X_test)
  class_rep=classification_report(Y_test,Y_pred)
  return(drug,class_rep)
In [24]:
rff=RandomForestClassifier
class_rep_rff={}
knn=KNeighborsClassifier
class_rep_knn={}
svc=SVC
class_rep_svc={}
In [25]:
for drug in columns_to_categorize:
  d,c=prediction(rff, drug)
  class_rep_rff[d]=c
  d,c=prediction(svc,drug)
  class_rep_svc[d]=c
  d,c=prediction(knn, drug)
  class_rep_knn[d]=c
In [26]:
def compare_reports(cr1,cr2):
  L1=cr1.split('\n')
  L2=cr2.split('\n')

  headers = ["precision", "recall", "f1-score", "support"]
  C=[]
  for i in range(3):
    C1=[line.split()[i] for line in L1 if line.strip()]
    C.append(C1[0])

  header_line = "{:<20} {:>12} {:>12} {:>12}".format("", "Model 1", "Model 2", "Difference")
  print(header_line)
  print("=" * len(header_line))

  for i in range(len(C)):

    val1 = [float(L1[12].split()[i]) for i in range(2,5)]
    val2 = [float(L2[12].split()[i]) for i in range(2,5)]

    diff_val = [v1 - v2 for v1, v2 in zip(val1, val2)]


    line = "{:<20} {:>12} {:>12} {:>12}".format(C[i], str(val1[i]), str(val2[i]), str(diff_val[i]))
    print(line)
In [27]:
compare_reports(class_rep_rff['Alcohol'],class_rep_svc['Alcohol'])
                          Model 1      Model 2   Difference
===========================================================
precision                    0.94         0.92 0.019999999999999907
recall                       0.93         0.94 -0.009999999999999898
f1-score                     0.92         0.93 -0.010000000000000009
In [28]:
compare_reports(class_rep_rff['Alcohol'],class_rep_knn['Alcohol'])
                          Model 1      Model 2   Difference
===========================================================
precision                    0.94         0.58         0.36
recall                       0.93          0.6 0.33000000000000007
f1-score                     0.92         0.58 0.3400000000000001
In [29]:
compare_reports(class_rep_knn['Alcohol'],class_rep_svc['Alcohol'])
                          Model 1      Model 2   Difference
===========================================================
precision                    0.58         0.92 -0.3400000000000001
recall                        0.6         0.94 -0.33999999999999997
f1-score                     0.58         0.93 -0.3500000000000001
In [30]:
knn_params = {'n_neighbors':[3, 5, 7], 'weights':['uniform', 'distance']}
svc_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
rf_params = {'n_estimators': [50, 100, 150],'max_depth': [None, 10, 20],'min_samples_split': [2, 5, 10],'min_samples_leaf': [1, 2, 4]}
In [31]:
def create_classifier(modelo,**params):
    return modelo(**params)
In [32]:
def prediction_grid_search(modelo,drug,params):
  model=modelo()
  X=df2[dfs[f'df_{drug}'].index]
  Y=df2[drug]
  X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=37)

  grid_search=GridSearchCV(create_classifier(modelo),params, cv=5, scoring='accuracy')
  grid_search.fit(X_train,Y_train)

  best_params=grid_search.best_params_

  best_model=create_classifier(modelo,**best_params)

  best_model.fit(X_train,Y_train)

  Y_pred=best_model.predict(X_test)

  accuracy = accuracy_score(Y_test, Y_pred)

  return [accuracy, best_params]
In [33]:
bring={'Drug':columns_to_categorize,'RFF': [0]*len(columns_to_categorize),
       "Best_params_rff": [0]*len(columns_to_categorize),
       "KNN": [0]*len(columns_to_categorize),'Best_params_knn': [0]*len(columns_to_categorize),
       'SVC': [0]*len(columns_to_categorize),'Best_params_svc':[0]*len(columns_to_categorize)}
final_df=pd.DataFrame(data=bring)
final_df
Out[33]:
Drug RFF Best_params_rff KNN Best_params_knn SVC Best_params_svc
0 Alcohol 0 0 0 0 0 0
1 Amphetamines 0 0 0 0 0 0
2 Amyl Nitrite 0 0 0 0 0 0
3 Benzodiazepine 0 0 0 0 0 0
4 Caffeine 0 0 0 0 0 0
5 Canabis 0 0 0 0 0 0
6 Chocolate 0 0 0 0 0 0
7 Cocaine 0 0 0 0 0 0
8 Crack 0 0 0 0 0 0
9 Ecstasy 0 0 0 0 0 0
10 Heroin 0 0 0 0 0 0
11 Ketamine 0 0 0 0 0 0
12 Legal Highs 0 0 0 0 0 0
13 LSD 0 0 0 0 0 0
14 Methadone 0 0 0 0 0 0
15 Mushrooms 0 0 0 0 0 0
16 Nicotine 0 0 0 0 0 0
17 Semer 0 0 0 0 0 0
18 VSA 0 0 0 0 0 0
In [ ]:
for drug in columns_to_categorize:
  final_df.loc[final_df['Drug']==drug,['RFF','Best_params_rff']]=prediction_grid_search(rff,drug,rf_params)
  final_df.loc[final_df['Drug']==drug,['KNN','Best_params_knn']]=prediction_grid_search(knn,drug,knn_params)
  final_df.loc[final_df['Drug']==drug,['SVC','Best_params_svc']]=prediction_grid_search(svc,drug,svc_params)
In [ ]:
final_df.to_csv('final_df.csv', index=False)

Drug Usage Prediction Model¶

We will create a machine learning model based on the RandomForest model, which predicts the probability of a person using drugs at least once. The aim is to predict whether a person has already taken certain drugs using only visible factors.

Data Preparation

We define columns for demographic data, psychological factors and the common drug used.

In [34]:
df_standard = df_standard.astype('str')

base_columns = ['Age','Sex', 'Education', 'Country', 'Ethnicity', 'Neuroticism', 'Extraversion',
                'Openness', 'Agreeableness', 'Conscientiousness']

drugs = ['Alcohol', 'Amphetamines', 'Amyl Nitrite', 'Benzodiazepine', 'Caffeine', 'Canabis',
         'Chocolate', 'Cocaine', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legal Highs',
         'LSD', 'Methadone', 'Mushrooms', 'Nicotine', 'Semer', 'VSA']

model_data = df_standard[base_columns + drugs]

Encoding and Model Training

We encode our categorical data and train a RandomForestClassifier for each drug. Drugs are encode in binary whereas the others with LabelEncoder.

In [35]:
# Encoding
label_encoders = {}
for col in base_columns + drugs:
    label_encoders[col] = LabelEncoder()
    model_data[col] = label_encoders[col].fit_transform(model_data[col].apply(lambda x: 0 if x == 'Never Used' else 1 if col in drugs else x))

# Train/test split
models = {}
for drug in drugs:
    X = model_data.drop(drugs, axis=1)
    y = model_data[drug]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    models[drug] = model

Prediction Function

We define a function to predict the probability of a person using drugs at least once, based on demographic and psychological factors.

In [36]:
def predict_drug_use_probability(age, sex, education, country, ethnicity, neuroticism, extraversion, openness, agreeableness, conscientiousness):
    encoded_input = {
        'Age': label_encoders['Age'].transform([age])[0],
        'Sex': label_encoders['Sex'].transform([sex])[0],
        'Education': label_encoders['Education'].transform([education])[0],
        'Country': label_encoders['Country'].transform([country])[0],
        'Ethnicity': label_encoders['Ethnicity'].transform([ethnicity])[0],
        'Neuroticism': label_encoders['Neuroticism'].transform([neuroticism])[0],
        'Extraversion': label_encoders['Extraversion'].transform([extraversion])[0],
        'Openness': label_encoders['Openness'].transform([openness])[0],
        'Agreeableness': label_encoders['Agreeableness'].transform([agreeableness])[0],
        'Conscientiousness': label_encoders['Conscientiousness'].transform([conscientiousness])[0]
    }
    input_df = pd.DataFrame([encoded_input])
    probabilities = {}
    for drug, model in models.items():
        probability = model.predict_proba(input_df)[0][1]
        probabilities[drug] = probability
    return probabilities

Example of use

In [37]:
profile =df_standard[base_columns].iloc[0]
print(f"Profile:\n {profile}\n")
example_probabilities = predict_drug_use_probability(*profile)
Profile:
 Age                                              35-44
Sex                                             Female
Education            Professional certificate/ diploma
Country                                             UK
Ethnicity                            Mixed-White/Asian
Neuroticism                        Average Neuroticism
Extraversion                      Average Extraversion
Openness                              Average Openness
Agreeableness                    Average Agreeableness
Conscientiousness            Average Conscientiousness
Name: 1, dtype: object

Here, we have the probability of having taken at least once these drugs for this profile

In [38]:
print(f"Probability of having taken at least once \n")
for drug, probability in example_probabilities.items():
    print(f"{drug}: {probability * 100:.2f}%")
Probability of having taken at least once 

Alcohol: 99.50%
Amphetamines: 87.57%
Amyl Nitrite: 8.16%
Benzodiazepine: 81.85%
Caffeine: 100.00%
Canabis: 16.32%
Chocolate: 98.83%
Cocaine: 4.00%
Crack: 0.00%
Ecstasy: 2.36%
Heroin: 1.00%
Ketamine: 1.53%
Legal Highs: 2.95%
LSD: 1.83%
Methadone: 0.00%
Mushrooms: 2.30%
Nicotine: 86.50%
Semer: 0.00%
VSA: 4.25%

This prediction makes perfect sense, especially when it comes to alcohol, caffeine and chocolate, which are common drugs!

Compare the prediction of the profile with its actual data

We will use a binary approach by admitting that all probabilty above 50 % signify that the profile already use the drug.

In [39]:
def compare_predictions_with_actual(profile, example_probabilities):
    comparison = {}
    correct_predictions_count = 0
    for drug in example_probabilities:
        actual_use = profile[drug] != 'Never Used'
        predicted_use = example_probabilities[drug] >= 0.5
        correct_prediction = actual_use == predicted_use
        comparison[drug] = {
            'Actual Use': actual_use,
            'Predicted Use': predicted_use,
            'Correct Prediction': correct_prediction
        }
        if correct_prediction:
            correct_predictions_count += 1
    return comparison, correct_predictions_count

def calculate_precision(correct_predictions_count, total_drugs):
    if total_drugs > 0:
        precision = (correct_predictions_count / total_drugs) * 100
    else:
        precision = 0
    return precision

profile_with_drugs = df_standard.iloc[0]
comparison_results, correct_predictions_count = compare_predictions_with_actual(profile_with_drugs, example_probabilities)
for result in comparison_results:
    print(f"{result}: {comparison_results[result]}")

precision = calculate_precision(correct_predictions_count, len(drugs))
print(f"Precision: {precision:.2f}%")
Alcohol: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Amphetamines: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Amyl Nitrite: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Benzodiazepine: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Caffeine: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Canabis: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Chocolate: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Cocaine: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Crack: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Ecstasy: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Heroin: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Ketamine: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Legal Highs: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
LSD: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Methadone: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Mushrooms: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Nicotine: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Semer: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
VSA: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Precision: 100.00%

We can see a precision of 100 percent which is really good !

Compare the prediction of the profile with its actual data

We will use a binary approach by admitting that all probabilty above 50% signify that the profile already uses the drug.

In [40]:
def convert_probabilities_to_predictions(probabilities, threshold=0.5):
    return [1 if prob >= threshold else 0 for prob in probabilities]

X_test = model_data.drop(drugs, axis=1)
y_test_actuals = {}
y_test_predictions = {}

precisions = {}
for drug in drugs:
    model = models[drug]
    y_test_actuals[drug] = model_data[drug]
    probabilities = model.predict_proba(X_test)[:, 1]
    predictions = convert_probabilities_to_predictions(probabilities)
    y_test_predictions[drug] = predictions
    precision = precision_score(y_test_actuals[drug], predictions)
    precisions[drug] = precision

for drug, precision in precisions.items():
    print(f"Precision for {drug}: {precision * 100:.2f}%")
average_precision = sum(precisions.values()) / len(precisions)
print(f"Average Precision: {average_precision * 100:.2f}%")
Precision for Alcohol: 99.46%
Precision for Amphetamines: 86.76%
Precision for Amyl Nitrite: 84.77%
Precision for Benzodiazepine: 88.10%
Precision for Caffeine: 99.62%
Precision for Canabis: 93.40%
Precision for Chocolate: 99.52%
Precision for Cocaine: 84.54%
Precision for Crack: 90.24%
Precision for Ecstasy: 87.12%
Precision for Heroin: 92.34%
Precision for Ketamine: 86.88%
Precision for Legal Highs: 90.34%
Precision for LSD: 88.35%
Precision for Methadone: 83.49%
Precision for Mushrooms: 87.69%
Precision for Nicotine: 90.86%
Precision for Semer: 100.00%
Precision for VSA: 84.85%
Average Precision: 90.44%

Drug Use Prediction Model with common drugs (alcohol, caffeine, chocolate, nicotine)

We will create a machine learning model based on the RandomForest model, which predicts the probability of a person using drugs at least once, based on demographic and psychological factors and a few common drugs.

Data Preparation

We add the common drugs.

In [41]:
df_standard = df_standard.astype('str')

base_columns = ['Age', 'Sex', 'Education', 'Country', 'Ethnicity', 'Neuroticism', 'Extraversion',
                'Openness', 'Agreeableness', 'Conscientiousness', 'Alcohol', 'Caffeine', 'Nicotine', 'Chocolate']

drugs = ['Amphetamines', 'Amyl Nitrite', 'Benzodiazepine', 'Canabis',
          'Cocaine', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legal Highs',
         'LSD', 'Methadone', 'Mushrooms', 'Semer', 'VSA']

model_data = df_standard[base_columns + drugs]

Encoding and Model Training

Drugs are encode in binary whereas the others with LabelEncoder.

In [42]:
# Encoding the drug columns and other categorical columns
label_encoders = {}
for col in base_columns + drugs:
    label_encoders[col] = LabelEncoder()
    if col in drugs:
        model_data[col] = label_encoders[col].fit_transform(model_data[col].apply(lambda x: '0' if x == 'Never Used' else '1'))
    else:
        model_data[col] = label_encoders[col].fit_transform(model_data[col])

models = {}
for drug in drugs:
    X = model_data.drop(drugs, axis=1)
    y = model_data[drug]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    models[drug] = model

Prediction Function

We define a function to predict the probability of a person using drugs at least once, based on demographic and psychological factors, but also on common drugs such as alcohol, chocolate, caffeine and nicotine.

In [43]:
def predict_drug_use_probability(age, sex, education, country, ethnicity, neuroticism, extraversion, openness, agreeableness, conscientiousness,alcohol,caffeine,nicotine,chocolate):
    encoded_input = {
        'Age': label_encoders['Age'].transform([age])[0],
        'Sex': label_encoders['Sex'].transform([sex])[0],
        'Education': label_encoders['Education'].transform([education])[0],
        'Country': label_encoders['Country'].transform([country])[0],
        'Ethnicity': label_encoders['Ethnicity'].transform([ethnicity])[0],
        'Neuroticism': label_encoders['Neuroticism'].transform([neuroticism])[0],
        'Extraversion': label_encoders['Extraversion'].transform([extraversion])[0],
        'Openness': label_encoders['Openness'].transform([openness])[0],
        'Agreeableness': label_encoders['Agreeableness'].transform([agreeableness])[0],
        'Conscientiousness': label_encoders['Conscientiousness'].transform([conscientiousness])[0],
        'Alcohol': label_encoders['Alcohol'].transform([alcohol])[0],
        'Caffeine': label_encoders['Caffeine'].transform([caffeine])[0],
        'Nicotine': label_encoders['Nicotine'].transform([nicotine])[0],
        'Chocolate': label_encoders['Chocolate'].transform([chocolate])[0]
    }
    input_df = pd.DataFrame([encoded_input])

    probabilities = {}
    for drug, model in models.items():
        probability = model.predict_proba(input_df)[0][1]
        probabilities[drug] = probability
    return probabilities

Example of use :

In [44]:
profile =df_standard[base_columns].iloc[1]
print(f"Profile:\n {profile}\n")
Profile:
 Age                                      25-34
Sex                                       Male
Education                     Doctorate degree
Country                                     UK
Ethnicity                                White
Neuroticism                Average Neuroticism
Extraversion                 High Extraversion
Openness                         High Openness
Agreeableness            Average Agreeableness
Conscientiousness    Average Conscientiousness
Alcohol                      Used in Last Week
Caffeine                      Used in Last Day
Nicotine                    Used in Last Month
Chocolate                     Used in Last Day
Name: 2, dtype: object

In [45]:
example_probabilities = predict_drug_use_probability(*profile)
print(f"Probability of having take at least one time \n")
for drug, probability in example_probabilities.items():
    print(f"{drug}: {probability * 100:.2f}%")
Probability of having take at least one time 

Amphetamines: 85.00%
Amyl Nitrite: 86.00%
Benzodiazepine: 19.00%
Canabis: 99.00%
Cocaine: 81.00%
Crack: 8.00%
Ecstasy: 87.00%
Heroin: 1.00%
Ketamine: 82.00%
Legal Highs: 11.00%
LSD: 82.00%
Methadone: 73.00%
Mushrooms: 26.00%
Semer: 0.00%
VSA: 7.00%

Let's evaluate the precision of the model with the same profile and for each drugs

We will estimate that if the probability of this profile to have already used a drug is more than 50 percent, then he has used it once before.

In [46]:
def compare_predictions_with_actual(profile, example_probabilities):
    comparison = {}
    correct_predictions_count = 0
    for drug in example_probabilities:
        actual_use = profile[drug] != 'Never Used'
        predicted_use = example_probabilities[drug] >= 0.5
        correct_prediction = actual_use == predicted_use
        comparison[drug] = {
            'Actual Use': actual_use,
            'Predicted Use': predicted_use,
            'Correct Prediction': correct_prediction
        }
        if correct_prediction:
            correct_predictions_count += 1
    return comparison, correct_predictions_count

def calculate_precision(correct_predictions_count, total_drugs):
    if total_drugs > 0:
        precision = (correct_predictions_count / total_drugs) * 100
    else:
        precision = 0
    return precision

profile_with_drugs = df_standard.iloc[0]
comparison_results, correct_predictions_count = compare_predictions_with_actual(profile_with_drugs, example_probabilities)
for result in comparison_results:
    print(f"{result}: {comparison_results[result]}")

precision = calculate_precision(correct_predictions_count, len(drugs))
print(f"Precision: {precision:.2f}%")
Amphetamines: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Amyl Nitrite: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Benzodiazepine: {'Actual Use': True, 'Predicted Use': False, 'Correct Prediction': False}
Canabis: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Cocaine: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Crack: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Ecstasy: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Heroin: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Ketamine: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Legal Highs: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
LSD: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Methadone: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Mushrooms: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Semer: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
VSA: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Precision: 46.67%

We have also 100% percent of precision here

Now let's try to evaluate the model more globally

In [47]:
cv_scores = {}
for drug in drugs:
    X = model_data.drop(drugs, axis=1)
    y = model_data[drug]
    model = RandomForestClassifier(random_state=42)

    accuracy = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(model, X, y, cv=5, scoring='precision').mean()
    recall = cross_val_score(model, X, y, cv=5, scoring='recall').mean()
    f1 = cross_val_score(model, X, y, cv=5, scoring='f1').mean()

    cv_scores[drug] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

for drug, scores in cv_scores.items():
    print(f"{drug}: {scores}")
Amphetamines: {'Accuracy': 0.6726790450928382, 'Precision': 0.6660439373265128, 'Recall': 0.670001821383037, 'F1 Score': 0.657977597793498}
Amyl Nitrite: {'Accuracy': 0.7177718832891247, 'Precision': 0.5625714129064093, 'Recall': 0.3844827586206897, 'F1 Score': 0.44940824855895656}
Benzodiazepine: {'Accuracy': 0.6668435013262599, 'Precision': 0.6601218258640413, 'Recall': 0.6033898305084746, 'F1 Score': 0.6214519345673828}
Canabis: {'Accuracy': 0.8482758620689654, 'Precision': 0.8836849516181594, 'Recall': 0.9279534186555978, 'F1 Score': 0.9044743352317953}
Cocaine: {'Accuracy': 0.683289124668435, 'Precision': 0.6643628950577647, 'Recall': 0.6326627218934912, 'F1 Score': 0.6352663678838143}
Crack: {'Accuracy': 0.8567639257294429, 'Precision': 0.41208791208791207, 'Recall': 0.08521870286576169, 'F1 Score': 0.136518615032766}
Ecstasy: {'Accuracy': 0.6965517241379311, 'Precision': 0.6865948215323375, 'Recall': 0.6755948380158623, 'F1 Score': 0.6621820411597504}
Heroin: {'Accuracy': 0.8461538461538461, 'Precision': 0.5454220779220779, 'Recall': 0.10357142857142856, 'F1 Score': 0.16262739639189236}
Ketamine: {'Accuracy': 0.7777188328912465, 'Precision': 0.5032612270714738, 'Recall': 0.13924050632911394, 'F1 Score': 0.20133835785026816}
Legal Highs: {'Accuracy': 0.7517241379310344, 'Precision': 0.7352082346594845, 'Recall': 0.6788153809410079, 'F1 Score': 0.6837768343386229}
LSD: {'Accuracy': 0.7172413793103448, 'Precision': 0.7142869927473644, 'Recall': 0.6616339967080652, 'F1 Score': 0.6524936903810927}
Methadone: {'Accuracy': 0.7809018567639257, 'Precision': 0.6657684759871472, 'Recall': 0.3770664118490205, 'F1 Score': 0.4276268756824034}
Mushrooms: {'Accuracy': 0.7013262599469496, 'Precision': 0.7011485864773364, 'Recall': 0.6995089011663597, 'F1 Score': 0.6783927227060144}
Semer: {'Accuracy': 0.9957559681697612, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0}
VSA: {'Accuracy': 0.7575596816976129, 'Precision': 0.5785185185185185, 'Recall': 0.15348837209302327, 'F1 Score': 0.21560516198253513}

We can also draw a ROC curve for one drug to see the accuracy of the prediction on this drug :

In [48]:
X = model_data.drop(drugs, axis=1)
y = model_data['Amphetamines']
model = RandomForestClassifier(random_state=42)
y_scores = cross_val_predict(model, X, y, cv=5, method='predict_proba')[:, 1]

fpr, tpr, thresholds = roc_curve(y, y_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

We can see that the model is pretty accurate

Now let's evaluate if using binary predictions after the other model improves the prediction :

In [49]:
def convert_probabilities_to_predictions(probabilities, threshold=0.5):
    return [1 if prob >= threshold else 0 for prob in probabilities]

X_test = model_data.drop(drugs, axis=1)
y_test_actuals = {}
y_test_predictions = {}

precisions = {}
for drug in drugs:
    model = models[drug]
    y_test_actuals[drug] = model_data[drug]
    probabilities = model.predict_proba(X_test)[:, 1]
    predictions = convert_probabilities_to_predictions(probabilities)
    y_test_predictions[drug] = predictions
    precision = precision_score(y_test_actuals[drug], predictions)
    precisions[drug] = precision

for drug, precision in precisions.items():
    print(f"Precision for {drug}: {precision * 100:.2f}%")

average_precision = sum(precisions.values()) / len(precisions)
print(f"Average Precision: {average_precision * 100:.2f}%")
Precision for Amphetamines: 92.70%
Precision for Amyl Nitrite: 94.24%
Precision for Benzodiazepine: 93.87%
Precision for Canabis: 97.00%
Precision for Cocaine: 92.40%
Precision for Crack: 95.59%
Precision for Ecstasy: 92.11%
Precision for Heroin: 96.30%
Precision for Ketamine: 94.63%
Precision for Legal Highs: 94.17%
Precision for LSD: 92.87%
Precision for Methadone: 92.94%
Precision for Mushrooms: 93.44%
Precision for Semer: 100.00%
Precision for VSA: 94.81%
Average Precision: 94.47%

Conclusion :¶

To conclude, the model's efficiency notably improves by 5% when it incorporates common drugs into its prediction profile, marking a substantial enhancement. It adeptly combines psychological, demographic, and common drug usage data to accurately predict drug exposure risks. This approach is vital for pinpointing individuals at risk, particularly when such risk isn't immediately apparent. Demonstrating its practical use, the model effectively identifies potential drug users by analyzing a range of influencing factors.

Data Visualization¶

In [50]:
avg_drug_usage_per_country=df4.groupby('Country')[columns_to_categorize].mean()

def country_plot_drug_usage(country):
  plt.figure(figsize=(27,20))
  df_country=avg_drug_usage_per_country.loc[country]
  plt.bar(avg_drug_usage_per_country.columns,df_country)
  plt.title(f"Drug usage in {country}")
  plt.xlabel("Drugs")
  plt.ylabel("Mean Percentage")
  plt.show()
for country in df4['Country'].unique():
  country_plot_drug_usage(country)

Impact of the feature on drug consumption¶

In [51]:
def impact_feature(feature):
  for i,col in enumerate(['Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']):
    graph = px.histogram(df4, x = col, color=feature,barmode='group', title=f'Impact of the {feature} on '+col+' consumption')
    graph.show()
In [52]:
impact_feature('Country')

Percentage of people surveyed¶

In [53]:
df1 = df3.copy(deep=True)
def survey_percentage(feature):
  counts_by_country = df1[feature].value_counts()
  df1[f'{feature}WithCount'] = df1[feature].apply(lambda x: f"{x} ({counts_by_country[x]})")

  fig = px.pie(df1, names=f'{feature}WithCount', color=f'{feature}WithCount', title=f"Percentage of people surveyed by {feature}")
  fig.show()
In [54]:
Things=['Age','Sex','Education','Country','Ethnicity','Neuroticism','Extraversion','Openness','Agreeableness','Conscientiousness','Impulsivness','Sensation Seeing']
for col in Things:
  survey_percentage(col)

Relation between features¶

In [57]:
def box_relation(feat1,feat2):
  sns.boxplot(x=feat1, y=feat2, data=df2)
  plt.xticks(rotation=45)
  plt.show()
In [58]:
box_relation("Sex","Impulsivness") #Men are more impulsive than women